library(readxl)
library(dplyr)
library(ggplot2)
library(plotly)
library(jmv)
library(leaflet)
library(plotrix)
library(httr)
library(rvest)
library(sqldf)STAT 570 Final Project
Dataset of Exceptional Women Directors and Carbon Information of Global Energy Companies
The data used in this project gives valuable information about the EWD( Exceptional Women Director) score and CID(Carbon Information Disclosures) score of international leading energy companies between the years 2018 and 2020. In order to create a more sustainable future, the data is helpful for researchers examining women’s participation in net-zero emissions, gender equality, climate resilience, renewable energy, and energy transition in corporate boardrooms.

According to the linked study article by Majid and Jaaffar (2023), the data includes 97 companies with 291 observations from Thomson Reuters listings. Purposive random sampling was used to choose the companies for the sample based on the data collected.

REFERENCE
Majid, N. A., & Jaaffar, A. H. (2023). The effect of women’s leadership on carbon disclosure by the top 100 global energy leaders. Sustainability, 15(11), 8491. https://doi.org/10.3390/su15118491
Reading and Cleaning the Data
Necessary Libraries
Downloading and Reading the Data
First, we define the working directory (optional), then we define the link for the zip file of the dataset. After,
we set out the local file name for this zip file, then we download it. Afterwards, we unzip the file and define the names of the excel files.
#defining working directory
setwd("/Users/mehmeterkan/Desktop")
#defining the link for the data
zipF <- "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/d2s9yz65mm-4.zip"
#set out the local file name for the zip file
local_zip_filename <- "Dataset of Women Directors Engagement.zip"
#downloading the data
download.file(zipF, local_zip_filename, mode = "wb", method = "auto")
#unzip the file
unzip(local_zip_filename)
#defining the name of the excel files
excel_files <- c("CID Scores (Table A).xlsx", "WDs Engagement (Table B).xlsx", "EWDs Aggregated Score (Table C).xlsx")Reading the Each Excel File
CID Scores (Table A)

This excel sheet includes information about binary and total CID scores of the companies between the years 2018 and 2020. First of all, we read the CID Scores excel file and read each sheet separately.
#defining working directory as the file name
setwd("/Users/mehmeterkan/Desktop/Dataset of Women Directors’ Engagement and Carbon Information Disclosures of Global Energy Companies")
#reading each sheet in the excel file
CID_2018 <- read_excel("CID Scores (Table A).xlsx", sheet = "FYE 2018")
CID_2019 <- read_excel("CID Scores (Table A).xlsx", sheet = "FYE 2019")
CID_2020 <- read_excel("CID Scores (Table A).xlsx", sheet = "FYE 2020")After reading the sheets, we found out some empty columns, and delete them. After that, we define the column names for the tables. We add year as a new variable into tables and combine all three table into one table. Lastly, we convert variables into numeric except company_name and year.
#deleting unnecessary rows
CID_2018 <- CID_2018[-c(1,2),]
CID_2019 <- CID_2019[-c(1,2),]
CID_2020 <- CID_2020[-c(1,2),]
#setting common column names
set_column_names <- function(data, year) {
colnames(data) <- c("company_name", "strategy_policy", "climate_change_opportunities",
"corporate_ghg_emissions_targets", "company_wide_carbon_footprint",
"ghg_emissions_change_over_time", "energy_related_reporting",
"emission_reduction_initiatives_implementation",
"carbon_emission_accountability", "quality_of_disclosure",
paste0("quality_of_DisclosureTotal_cid_scores"),"year")
data$year <- year
return(data)
}
CID_2018 <- set_column_names(CID_2018,"2018")
CID_2019 <- set_column_names(CID_2019,"2019")
CID_2020 <- set_column_names(CID_2020,"2020")
#combining all the tables into one table
CID_scores <- rbind(CID_2018, CID_2019,CID_2020)
#converting variables to numeric
CID_scores <- CID_scores %>%
mutate_at(vars(-company_name, -year), as.numeric)CID Scores Table After Cleaning
head(CID_scores,10)# A tibble: 10 × 12
company_name strategy_policy climate_change_oppor…¹ corporate_ghg_emissi…²
<chr> <dbl> <dbl> <dbl>
1 Acea SpA 7 5 1
2 Aker Solutions 6 5 0
3 Amec Foster Wh… 6 5 2
4 Avangrid 6 5 4
5 Bharat Petrole… 7 5 3
6 BP 6 5 3
7 Cairn India 7 5 4
8 Cameco 6 5 0
9 Canadian Natur… 5 5 2
10 Chevron Corpor… 7 5 4
# ℹ abbreviated names: ¹climate_change_opportunities,
# ²corporate_ghg_emissions_targets
# ℹ 8 more variables: company_wide_carbon_footprint <dbl>,
# ghg_emissions_change_over_time <dbl>, energy_related_reporting <dbl>,
# emission_reduction_initiatives_implementation <dbl>,
# carbon_emission_accountability <dbl>, quality_of_disclosure <dbl>,
# quality_of_DisclosureTotal_cid_scores <dbl>, year <chr>
WDs Engagement (Table B)

This excel file contains information about the percentage of WD’s involved on the board and according to their classification between 2018 and 2020. First of all, we read the WDs Engagement excel file and read each sheet separately.
#defining working directory as the file name
setwd("/Users/mehmeterkan/Desktop/Dataset of Women Directors’ Engagement and Carbon Information Disclosures of Global Energy Companies")
#reading each sheet one by one
WDs_2018 <- read_xlsx("WDs Engagement (Table B).xlsx", sheet = "FYE 2018")
WDs_2019 <- read_xlsx("WDs Engagement (Table B).xlsx", sheet = "FYE 2019")
WDs_2020 <- read_xlsx("WDs Engagement (Table B).xlsx", sheet = "FYE 2020")After reading the sheets, we found out some empty columns, and delete them. After that, we define the column names for the tables. We add year as a new variable into tables and combine all three table into one table. Then, we convert columns to numeric expect the company_name and year column.
#deleting uncesessary rows
WDs_2018 <- WDs_2018[-1,]
WDs_2019 <- WDs_2019[-1,]
WDs_2020 <- WDs_2020[-1,]
#setting common column names
set_column_names_2 <- function(data, year) {
colnames(data) <- c("company_name", "number_of_wd", "per_of_wd_on_board",
"per_of_wd_industry_expert",
"per_of_wd_advisors","per_of_wd_community_leader")
data$year <- year
return(data)
}
WDs_2018 <- set_column_names_2(WDs_2018,"2018")
WDs_2019 <- set_column_names_2(WDs_2019,"2019")
WDs_2020 <- set_column_names_2(WDs_2020,"2020")
#combining all the tables in one table
WDs_engagement <- rbind(WDs_2018, WDs_2019, WDs_2020)
#converting columns to numeric (excluding company_name,year)
WDs_engagement <- WDs_engagement |>
mutate_at(vars(-company_name,-year), function(x) round(as.numeric(sub("%", "", x)), 2))WDs Engagement Table After Cleaning
head(WDs_engagement,10)# A tibble: 10 × 7
company_name number_of_wd per_of_wd_on_board per_of_wd_industry_e…¹
<chr> <dbl> <dbl> <dbl>
1 Acea SpA 4 0.44 0.44
2 Aker Solutions 5 0.45 0.45
3 Amec Foster Wheeler (… 3 0.33 0.33
4 Avangrid 4 0.29 0.29
5 Bharat Petroleum 0 0 0
6 BP 5 0.42 0.33
7 Cairn India 2 0.2 0.1
8 Cameco 3 0.33 0.33
9 Canadian Natural Reso… 5 0.42 0.33
10 Chevron Corporation 4 0.36 0.36
# ℹ abbreviated name: ¹per_of_wd_industry_expert
# ℹ 3 more variables: per_of_wd_advisors <dbl>,
# per_of_wd_community_leader <dbl>, year <chr>
EWDs Aggregated Scores (Table C)

This file has the information about EWD’s engagement scores are indicated by the total score across the four EWD parameters. First of all, we read the EWDs Aggregated Scores excel file and read each sheet separately.
#defining working directory as the file name
setwd("/Users/mehmeterkan/Desktop/Dataset of Women Directors’ Engagement and Carbon Information Disclosures of Global Energy Companies")
#reading each sheet in the excel file
EWDs_2018 <- read_xlsx("EWDs Aggregated Score (Table C).xlsx", sheet = "FYE 2018")
EWDs_2019 <- read_xlsx("EWDs Aggregated Score (Table C).xlsx", sheet = "FYE 2019")
EWDs_2020 <- read_xlsx("EWDs Aggregated Score (Table C).xlsx", sheet = "FYE 2020")we define the column names for the tables. We add year as a new variable into tables.Then, we replace name in empty spaces and combine all three table into one table.
#setting common column names
set.column.names <- function(data,year) {
colnames(data) <- c("company_name", "code", "WDsName", "industry_expert",
"advisor", "community_leaders", "energy_experiments", "log_of_energy_experiment")
data$year <- year
return(data)
}
EWDs_2018 <- set.column.names(EWDs_2018,"2018")
EWDs_2019 <- set.column.names(EWDs_2019,"2019")
EWDs_2020 <- set.column.names(EWDs_2020,"2020")
#replacing name in empty space
for (i in 1:4) {EWDs_2018 <- EWDs_2018 |>
mutate('company_name' = ifelse(is.na(company_name), lag(company_name), company_name))}
for (i in 1:5) {EWDs_2019 <- EWDs_2019 |>
mutate('company_name' = ifelse(is.na(company_name), lag(company_name), company_name))}
for (i in 1:6) {EWDs_2020 <- EWDs_2020 |>
mutate('company_name' = ifelse(is.na(company_name), lag(company_name), company_name))}
#combining all the tables into one table
EWDs_scores <- rbind(EWDs_2018,EWDs_2019,EWDs_2020)Then, we manipulate the data and add a new column wd_title based on the energy experiments of the directors.
#converting energy_experiments variable into numeric
EWDs_scores$energy_experiments <- as.numeric(EWDs_scores$energy_experiments)
#adding new column as wd_title accoring to the energy_experiments
EWDs_scores <- EWDs_scores |>
mutate(wd_title = cut(energy_experiments, breaks = c(-Inf, 0 ,15, 30, 49), labels = c("No Experience","Assistant Director", "Director","Senior Director"), include.lowest = TRUE))EWDs Scores Table After Cleaning
head(EWDs_scores,10)# A tibble: 10 × 10
company_name code WDsName industry_expert advisor community_leaders
<chr> <chr> <chr> <chr> <chr> <chr>
1 Acea SpA ACEA… Michae… 1 1 0
2 Acea SpA ACEA… Gabrie… 1 1 1
3 Acea SpA ACEA… Lilian… 1 1 1
4 Aker Solutions AKER… Birgit… 1 1 0
5 Aker Solutions AKER… Koosum… 1 1 1
6 Aker Solutions AKER… Anne D… 1 1 0
7 Aker Solutions AKER… Hilde … 1 0 1
8 Amec Foster Wheeler … AMEC… Jacqui… 1 1 1
9 Amec Foster Wheeler … AMEC… Linda … 1 1 0
10 Amec Foster Wheeler … AMEC… Jann B… 1 1 1
# ℹ 4 more variables: energy_experiments <dbl>, log_of_energy_experiment <dbl>,
# year <chr>, wd_title <fct>
Descriptive Statistics
CID Scores Table
descriptives(
data = CID_scores,
vars = c("strategy_policy", "climate_change_opportunities", "corporate_ghg_emissions_targets", "company_wide_carbon_footprint",
"ghg_emissions_change_over_time", "energy_related_reporting", "emission_reduction_initiatives_implementation",
"carbon_emission_accountability","quality_of_disclosure", "quality_of_DisclosureTotal_cid_scores"),
freq = TRUE,
hist = TRUE,
dens = TRUE,
sd = TRUE)
DESCRIPTIVES
Descriptives
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
strategy_policy climate_change_opportunities corporate_ghg_emissions_targets company_wide_carbon_footprint ghg_emissions_change_over_time energy_related_reporting emission_reduction_initiatives_implementation carbon_emission_accountability quality_of_disclosure quality_of_DisclosureTotal_cid_scores
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
N 291 291 291 291 291 291 291 291 291 291
Missing 0 0 0 0 0 0 0 0 0 0
Mean 6.484536 4.845361 2.769759 9.364261 2.718213 8.350515 17.31615 5.762887 10.48110 68.19244
Median 7.000000 5.000000 3.000000 10.00000 3.000000 9.000000 18.00000 6.000000 11.00000 71.00000
Standard deviation 1.263111 0.8671023 1.307240 3.167392 1.752725 2.949136 3.571397 1.064537 4.267215 14.89541
Minimum 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Maximum 7.000000 5.000000 5.000000 13.00000 6.000000 11.00000 20.00000 6.000000 17.00000 87.00000
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
WDs Engagement Table
descriptives(
data = WDs_engagement,
vars = c("number_of_wd", "per_of_wd_on_board",
"per_of_wd_industry_expert",
"per_of_wd_advisors","per_of_wd_community_leader"),
freq = TRUE,
hist = TRUE,
dens = TRUE,
sd = TRUE)
DESCRIPTIVES
Descriptives
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
number_of_wd per_of_wd_on_board per_of_wd_industry_expert per_of_wd_advisors per_of_wd_community_leader
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
N 291 291 291 291 291
Missing 0 0 0 0 0
Mean 2.762887 0.2376632 0.2054296 0.2030241 0.1318900
Median 3.000000 0.2500000 0.2000000 0.2000000 0.1000000
Standard deviation 1.752475 0.1417478 0.1430869 0.1424071 0.1154849
Minimum 0.000000 0.000000 0.000000 0.000000 0.000000
Maximum 8.000000 0.6000000 0.5600000 0.6000000 0.5000000
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
EWDs Scores Table
descriptives(
data = EWDs_scores,
vars = c("industry_expert","advisor", "community_leaders", "energy_experiments", "log_of_energy_experiment"),
freq = TRUE,
hist = TRUE,
dens = TRUE,
sd = TRUE)
DESCRIPTIVES
Descriptives
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
industry_expert advisor community_leaders energy_experiments log_of_energy_experiment
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
N 832 832 832 807 734
Missing 0 0 0 25 98
Mean 9.434944 1.782397
Median 5.000000 1.791759
Standard deviation 10.65360 1.109346
Minimum 0.000000 0.000000
Maximum 49.00000 3.891820
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Analysis
What are the companies with the highest average quality of disclosure total of CID scores of three years 2018,2019,2020?
#finding the mean cid_scores of three years for each company and slicing the highest ten
mean_cid <- CID_scores %>%
filter(year %in% c("2018", "2019", "2020")) %>%
group_by(company_name) %>%
summarize(avg_cid_score = mean(quality_of_DisclosureTotal_cid_scores, na.rm = TRUE)) |>
slice_max(order_by=avg_cid_score,n=10)
mean_cid# A tibble: 10 × 2
company_name avg_cid_score
<chr> <dbl>
1 Hera 83.3
2 Fairmount Santrol 83
3 Encana 80.7
4 Formosa Petrochemical Corporation 80
5 Hellenic Petroleum 79.7
6 Acea SpA 79.3
7 DCC 78.7
8 Hess Corporation 78.7
9 PKN ORLEN 78.7
10 Suncor Energy 78.7
plot_ly(data = mean_cid, x = ~company_name, y = ~avg_cid_score, type = 'bar', color = ~avg_cid_score) %>%
layout(title = "Top 10 Companies by Average Score",
xaxis = list(title = "Company Name", categoryorder = 'total descending'),
yaxis = list(title = "Average Score", range = c(75, 85)))How does the percentage of experience levels of women directors change between the years 2018 and 2020?
#creating proportion table for year 2018
EWDs_scores_2018 <- EWDs_scores |> select(wd_title, year) |>
filter(year == "2018") |> na.omit()
tab1 <- prop.table(table(EWDs_scores_2018$wd_title))
df1 <- data.frame(wd_title=names(tab1), proportion=as.numeric(tab1))
#creating proportion table for year 2019
EWDs_scores_2019 <- EWDs_scores |> select(wd_title, year) |>
filter(year == "2019") |> na.omit()
tab2 <- prop.table(table(EWDs_scores_2019$wd_title))
df2 <- data.frame(wd_title=names(tab2), proportion=as.numeric(tab2))
#creating proportion table for year 2020
EWDs_scores_2020 <- EWDs_scores |> select(wd_title, year) |>
filter(year == "2020") |> na.omit()
tab3 <- prop.table(table(EWDs_scores_2020$wd_title))
df3 <- data.frame(wd_title=names(tab3), proportion=as.numeric(tab3))par(mfrow=c(1,3))
pie3D(df1$proportion, labels = round(df1$proportion,2),
main = "Percentage of Experience\nLevel of Women Directors in 2018", col = rainbow(length(df1$proportion)))
legend("topright", c("No Experience", "Assistant Director", "Director", "Senior Director"),
cex = 0.5, fill = rainbow(length(df1$proportion)))
pie3D(df2$proportion, labels = round(df2$proportion,2),
main = "Percentage of Experience\nLevel of Women Directors in 2019", col = rainbow(length(df2$proportion)))
legend("topright", c("No Experience", "Assistant Director", "Director", "Senior Director"),
cex = 0.5, fill = rainbow(length(df2$proportion)))
pie3D(df3$proportion, labels = round(df3$proportion,2),
main = "Percentage of Experience\nLevel of Women Directors in 2020", col = rainbow(length(df3$proportion)))
legend("topright", c("No Experience", "Assistant Director", "Director", "Senior Director"),
cex = 0.5, fill = rainbow(length(df3$proportion)))What are the number of energy companies of the countries and continents?
company <- read.csv('/Users/mehmeterkan/Desktop/company.csv', sep = ";")
company <- company[,-1]
table(company$Continent)
Africa Asia Australia Europe North America
1 25 3 42 24
South America
2
url_ulke <- "https://developers.google.com/public-data/docs/canonical/countries_csv?hl=en"
res <- GET(url_ulke)
html_con <- content(res, "text")
html_ulke <- read_html(html_con)
# Extract all tables from the webpage
tables <- html_ulke %>%
html_nodes("table") %>%
html_table()
long_lat_country <- tables[[1]]
long_lat_country <- long_lat_country[,-1]
colnames(long_lat_country)[3] <- "Country"
toplu_son_mu <- merge(x = company, y = long_lat_country, by = "Country", all.x = TRUE)
df <- sqldf::sqldf("SELECT Country, continent, latitude, longitude, COUNT(*) AS Freq
FROM toplu_son_mu
GROUP BY Country
ORDER BY Freq DESC")
df_son <- (sqldf::sqldf("SELECT Freq,A.* FROM df B LEFT JOIN toplu_son_mu A ON A.latitude=B.latitude"))
center_lon <- median(df$longitude, na.rm = TRUE)
center_lat <- median(df$lattitude, na.rm = TRUE)
getColor <- function(df) {
sapply(df$Freq, function(mag) {
if(mag <= 2) {
"green"
} else if(mag <= 4) {
"orange"
} else {
"red"
} })
}
icons <- awesomeIcons(
icon = 'ios-close',
iconColor = 'black',
library = 'ion',
markerColor = getColor(df)
)
str(df)'data.frame': 31 obs. of 5 variables:
$ Country : chr "United States" "United Kingdom" "Italy" "India" ...
$ Continent: chr "North America" "Europe" "Europe" "Asia" ...
$ latitude : num 37.1 55.4 41.9 20.6 46.2 ...
$ longitude: num -95.71 -3.44 12.57 78.96 2.21 ...
$ Freq : int 19 8 7 7 5 5 4 4 3 3 ...
leaflet() %>%
addProviderTiles(providers$Esri.WorldStreetMap,
options = providerTileOptions(noWrap = TRUE)) %>%
addAwesomeMarkers(
data = df_son,
lng = ~longitude,
lat = ~latitude,
label = ~Country,
icon = icons,
popup = ~paste("<br>Number of Company:", Freq,
"<br>Company Names:", company_name),
clusterOptions = markerClusterOptions()
)